In [1]:
# Imports for data handling, DTW-based retrieval, Prophet forecasting and evaluation.
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns; sns.set(style="ticks", color_codes=True)
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
import fbprophet
from fbprophet import Prophet
import os
from scipy import stats
import sklearn
import tslearn
from tslearn.neighbors import KNeighborsTimeSeries
from tslearn.utils import to_time_series_dataset
# NOTE(review): mean_absolute_error is already imported above; the duplicate is harmless.
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
# Show the working directory contents so the reader can see the available data files.
print(os.listdir())
C:\Users\eoind\Anaconda3\lib\site-packages\sklearn\utils\deprecation.py:144: FutureWarning:

The sklearn.neighbors.base module is  deprecated in version 0.22 and will be removed in version 0.24. The corresponding classes / functions should instead be imported from sklearn.neighbors. Anything that cannot be imported from sklearn.neighbors is now part of the private API.

['.ipynb_checkpoints', 'Case Retrieval with DTW K-NN.ipynb', 'Data Loading and Preparation.ipynb', 'features.csv', 'Prophet Forecast & Data Augmentation with NN.ipynb', 'Sales_Data.csv', 'Sales_Data_.csv', 'sampleSubmission.csv', 'stores.csv', 'Stores_NN.csv', 'test.csv', 'train.csv', 'Training_Sales_Data.csv']

Prophet Forecasting

In [2]:
# Load the prepared weekly sales data; column 0 (week-ending date) becomes the index.
df = pd.read_csv('Sales_Data_.csv', index_col = 0)
df.head()
Out[2]:
Store_ID Store_Size Weekly_Sales
2010-02-07 1 151315 1643690.90
2010-02-14 1 151315 1641957.44
2010-02-21 1 151315 1611968.17
2010-02-28 1 151315 1409727.59
2010-03-07 1 151315 1554806.68

Store 1

In [3]:
# Hold out everything from 2012-05-20 onwards as the evaluation period.
split_date = '2012-05-20'
before_split = df.index < split_date
df_train = df[before_split]
df_test = df[~before_split]
In [4]:
# Build the Prophet input frame (ds = date, y = target) for Store 1.
sample_data = pd.DataFrame(df[df['Store_ID']==1]['Weekly_Sales'])
sample_data['ds'] = sample_data.index  # duplicated assignment in the original removed
sample_data['y'] = sample_data['Weekly_Sales']
sample_data.drop('Weekly_Sales', axis=1, inplace=True)

# Split while the date index is still in place; reset_index afterwards only
# affects the full frame used later for comparison.
sample_data_train = sample_data[sample_data.index < '2012-05-20']
sample_data_test = sample_data[sample_data.index >= '2012-05-20']

sample_data.reset_index(inplace=True)
In [5]:
# Fit Prophet on Store 1's training period; the data is weekly, so only
# yearly seasonality plus US holiday effects are modelled (Prophet itself
# disables weekly/daily seasonality here, per the INFO logs below).
m = Prophet(yearly_seasonality=True)
m.add_country_holidays(country_name='US')
m.fit(sample_data_train)
INFO:fbprophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:fbprophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
Out[5]:
<fbprophet.forecaster.Prophet at 0x172292e4320>
In [6]:
# Predict over the held-out dates and show the last few point/interval forecasts.
forecast = m.predict(sample_data_test)
forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].tail()
Out[6]:
ds yhat yhat_lower yhat_upper
19 2012-09-30 1.543806e+06 1.421600e+06 1.668405e+06
20 2012-10-07 1.584501e+06 1.452946e+06 1.719056e+06
21 2012-10-14 1.589013e+06 1.459940e+06 1.723270e+06
22 2012-10-21 1.554754e+06 1.419754e+06 1.685204e+06
23 2012-10-28 1.536245e+06 1.415842e+06 1.659880e+06
In [7]:
# Interactive forecast plot plus Prophet's component decomposition.
# NOTE(review): these imports would ideally live in the top import cell.
from fbprophet.plot import plot_plotly
import plotly.offline as py
py.init_notebook_mode()

fig = plot_plotly(m, forecast)  # This returns a plotly Figure
py.iplot(fig)
m.plot_components(forecast);
In [8]:
def make_comparison_dataframe(historical, forecast):
    """Join forecast point/interval columns with historical values on 'ds'.

    Both frames must contain a 'ds' column; the result is indexed by 'ds'
    and keeps the rows of `forecast` (left join), with NaN for dates that
    have no historical observation.
    """
    forecast_cols = forecast.set_index('ds')[['yhat', 'yhat_lower', 'yhat_upper']]
    historical_indexed = historical.set_index('ds')
    return forecast_cols.join(historical_indexed)
In [9]:
# Register pandas datetime converters for matplotlib, then plot actuals vs forecast.
pd.plotting.register_matplotlib_converters()
cmp_df = make_comparison_dataframe(sample_data, forecast)
cmp_df.head()  # NOTE(review): not the last expression, so this displays nothing
cmp_df[['y', 'yhat']].plot()
Out[9]:
<matplotlib.axes._subplots.AxesSubplot at 0x1722d33a518>
In [10]:
# Test-period MAE for Store 1 with the base Prophet model.
mean_absolute_error(cmp_df['y'].values, cmp_df['yhat'].values)
Out[10]:
57115.32498278134

Store 6 which is the Nearest Neighbour of Store 1

In [11]:
# Repeat the base Prophet pipeline for Store 6, Store 1's nearest neighbour.
sample_data = pd.DataFrame(df[df['Store_ID']==6]['Weekly_Sales'])
sample_data['ds'] = sample_data.index  # duplicated assignment in the original removed
sample_data['y'] = sample_data['Weekly_Sales']
sample_data.drop('Weekly_Sales', axis=1, inplace=True)

# Split on the date index before it is reset.
sample_data_train = sample_data[sample_data.index < '2012-05-20']
sample_data_test = sample_data[sample_data.index >= '2012-05-20']
sample_data.reset_index(inplace=True)


m = Prophet(yearly_seasonality=True)
m.add_country_holidays(country_name='US')
m.fit(sample_data_train)

forecast = m.predict(sample_data_test)
py.init_notebook_mode()

fig = plot_plotly(m, forecast)  # This returns a plotly Figure
py.iplot(fig)
m.plot_components(forecast);
INFO:fbprophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:fbprophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.

Error Analysis

In [12]:
# Compare Store 6 actuals vs its forecast over the test period.
cmp_df = make_comparison_dataframe(sample_data, forecast)
cmp_df.head()  # NOTE(review): not the last expression, so this displays nothing
cmp_df[['y', 'yhat']].plot()
Out[12]:
<matplotlib.axes._subplots.AxesSubplot at 0x1722d4446d8>
In [13]:
# Peek at the neighbour's (Store 6) training target series.
sample_data_train['y']
Out[13]:
2010-02-07    1652635.10
2010-02-14    1606283.86
2010-02-21    1567138.07
2010-02-28    1432953.21
2010-03-07    1601348.82
2010-03-14    1558621.36
2010-03-21    1693058.91
2010-03-28    1472033.38
2010-04-04    1770333.90
2010-04-11    1667181.82
2010-04-18    1519846.36
2010-04-25    1540435.99
2010-05-02    1498080.16
2010-05-09    1619920.04
2010-05-16    1524059.40
2010-05-23    1531938.44
2010-05-30    1644470.66
2010-06-06    1857380.09
2010-06-13    1685652.35
2010-06-20    1677248.24
2010-06-27    1640681.88
2010-07-04    1759777.25
2010-07-11    1690317.99
2010-07-18    1560120.80
2010-07-25    1585240.92
2010-08-01    1532308.78
2010-08-08    1633241.59
2010-08-15    1547654.98
2010-08-22    1539930.50
2010-08-29    1450766.12
                 ...    
2011-10-23    1417922.37
2011-10-30    1419445.12
2011-11-06    1523420.38
2011-11-13    1536176.54
2011-11-20    1524390.07
2011-11-27    2249811.55
2011-12-04    1688531.34
2011-12-11    1903385.14
2011-12-18    2034695.56
2011-12-25    2644633.02
2012-01-01    1598080.52
2012-01-08    1395339.71
2012-01-15    1344243.17
2012-01-22    1326255.70
2012-01-29    1315610.66
2012-02-05    1496305.78
2012-02-12    1620603.92
2012-02-19    1632616.09
2012-02-26    1465187.71
2012-03-04    1550385.65
2012-03-11    1569304.40
2012-03-18    1748010.29
2012-03-25    1495143.62
2012-04-01    1508933.26
2012-04-08    1840131.19
2012-04-15    1616394.45
2012-04-22    1456073.24
2012-04-29    1456221.10
2012-05-06    1543461.12
2012-05-13    1517075.67
Name: y, Length: 119, dtype: float64
In [14]:
# Prophet's point forecast for Store 6 over the test period.
cmp_df['yhat']
Out[14]:
ds
2012-05-20    1.552516e+06
2012-05-27    1.622021e+06
2012-06-03    1.655675e+06
2012-06-10    1.662719e+06
2012-06-17    1.670829e+06
2012-06-24    1.682211e+06
2012-07-01    1.674549e+06
2012-07-08    1.639284e+06
2012-07-15    1.599262e+06
2012-07-22    1.580411e+06
2012-07-29    1.578424e+06
2012-08-05    1.568071e+06
2012-08-12    1.541322e+06
2012-08-19    1.518286e+06
2012-08-26    1.513193e+06
2012-09-02    1.504586e+06
2012-09-09    1.458716e+06
2012-09-16    1.381568e+06
2012-09-23    1.326981e+06
2012-09-30    1.338623e+06
2012-10-07    1.393788e+06
2012-10-14    1.425747e+06
2012-10-21    1.406072e+06
2012-10-28    1.386262e+06
Name: yhat, dtype: float64
In [15]:
# Regressor series for Store 1: the neighbour's actual sales over the training
# period followed by the neighbour's base-model forecast over the test period.
# pd.concat replaces Series.append, which is deprecated and removed in pandas 2.0.
neighbour_1 = pd.concat([sample_data_train['y'], cmp_df['yhat']], ignore_index=True)

Data Augmentation Explanation

There are several ways to augment the training data for prediction

  1. Feature Augmentation - We predict store 6 using the base model then add the training value and predicted value as regressors for the store 1 prediction.

  2. Instance Augmentation - We add the NN's data as extra training instances (difficult, because it is unclear where to insert them while preserving the sequence order).

Let's check the correlation coefficient between these time series as a sanity check.

In [16]:
# Sanity check: Pearson correlation between Store 6 and Store 1 weekly sales.
np.corrcoef((df[df['Store_ID']==6]['Weekly_Sales']).values,(df[df['Store_ID']==1]['Weekly_Sales']).values)
Out[16]:
array([[1.        , 0.83884963],
       [0.83884963, 1.        ]])
In [17]:
# Store 1 forecast augmented with the neighbour's series as an extra regressor.
sample_data = pd.DataFrame(df[df['Store_ID']==1]['Weekly_Sales'])
sample_data['ds'] = sample_data.index  # duplicated assignment in the original removed
sample_data['y'] = sample_data['Weekly_Sales']
sample_data.drop('Weekly_Sales', axis=1, inplace=True)

# Attach the regressor before splitting so both train and test carry it.
sample_data['neighbour_1'] = neighbour_1.values
train = sample_data[sample_data.index < '2012-05-20']
test = sample_data[sample_data.index >= '2012-05-20']

sample_data.reset_index(inplace=True)
# NOTE(review): the original re-assigned 'neighbour_1' a second time here with
# the same values; the redundant assignment has been removed.


m = Prophet(yearly_seasonality=True)
m.add_country_holidays(country_name='US')
m.add_regressor('neighbour_1')
m.fit(train)

forecast = m.predict(test)
py.init_notebook_mode()

fig = plot_plotly(m, forecast)  # This returns a plotly Figure
py.iplot(fig)
m.plot_components(forecast);
INFO:fbprophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:fbprophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
In [18]:
# Compare Store 1 actuals vs the regressor-augmented forecast.
cmp_df = make_comparison_dataframe(sample_data, forecast)
cmp_df[['y', 'yhat']].plot()
Out[18]:
<matplotlib.axes._subplots.AxesSubplot at 0x1722af242b0>
In [19]:
# MAE with the neighbour regressor: ~54,930 vs ~57,115 for the base model above.
mean_absolute_error(cmp_df['y'].values, cmp_df['yhat'].values) #looks promising but we need to check if it works most cases
Out[19]:
54929.56252306452

Base Prophet Forecast

In [20]:
mae = []                                   # per-store test-period MAE (base model)
store_identity = list(df.Store_ID.unique())
base_prediction = []                       # per-store test-period predictions


def base_prophet_forecast(store_id, forecast_period):
    """Fit a base Prophet model for one store and record its test-period error.

    Parameters
    ----------
    store_id : value matching df['Store_ID'].
    forecast_period : date string; rows on/after it form the test period.

    Side effects: appends the MAE to the global `mae` list and the predicted
    values to the global `base_prediction` list (same order as the call order).
    """
    # NOTE(review): the original also built unused df_train/df_test locals here;
    # they shadowed the module-level split and have been removed.
    sample_data = pd.DataFrame(df[df['Store_ID']==store_id]['Weekly_Sales'])
    sample_data['ds'] = sample_data.index
    sample_data['y'] = sample_data['Weekly_Sales']
    sample_data.drop('Weekly_Sales', axis=1, inplace=True)

    # Split on the date index before it is reset.
    sample_data_train = sample_data[sample_data.index < forecast_period]
    sample_data_test = sample_data[sample_data.index >= forecast_period]
    sample_data.reset_index(inplace=True)

    m = Prophet(yearly_seasonality=True, daily_seasonality=False, weekly_seasonality=False)
    m.add_country_holidays(country_name='US')
    m.fit(sample_data_train)
    forecast = m.predict(sample_data_test)

    cmp_df = make_comparison_dataframe(sample_data, forecast)
    mae.append(mean_absolute_error(cmp_df['y'].values, cmp_df['yhat'].values))
    base_prediction.append(cmp_df['yhat'].values)
    
In [21]:
# Run the base forecast for every store with the same train/test cutoff.
cutoff = '2012-05-20'
for store_id in df.Store_ID.unique():
    base_prophet_forecast(store_id, cutoff)
In [22]:
# Distribution of per-store MAE for the base model.
# NOTE(review): sns.distplot is deprecated in newer seaborn (use histplot/displot).
sns.distplot(mae, bins=len(mae))
Out[22]:
<matplotlib.axes._subplots.AxesSubplot at 0x1722af49b38>
In [23]:
# Median per-store MAE for the base model.
np.median(mae)
Out[23]:
40752.68197095926
In [24]:
# Map each store id to its base-model test-period predictions.
base_predictions = {store: preds for store, preds in zip(store_identity, base_prediction)}

Data Augmentations using the 1 NN's

In [25]:
# Nearest-neighbour lookup table: each store's 1-NN
# (presumably produced by the DTW K-NN notebook listed above — TODO confirm).
nn_df = pd.read_csv('Stores_NN.csv', index_col = 0)
nn_df.head()
Out[25]:
Store_ID 1-NN
0 1 6
1 2 10
2 3 38
3 4 13
4 5 44
In [26]:
# Example lookup: base-model predictions for store 4.
base_predictions.get(4)
Out[26]:
array([2199429.18360289, 2200583.3804252 , 2175962.009921  ,
       2169348.70005547, 2199326.56900975, 2230733.41347038,
       2218074.80519351, 2166068.42455049, 2128561.45257848,
       2145903.77223531, 2199685.64158773, 2239962.87881111,
       2245970.09103604, 2240968.69636229, 2247088.93333957,
       2247893.31991323, 2213964.56216578, 2156414.19222032,
       2128719.94694888, 2164123.38658903, 2228148.66487608,
       2255934.08662892, 2235313.85112718, 2228954.50006573])
In [27]:
# Preserve the date index as a column, attach each store's 1-NN id, then
# restore the date index (merge resets it to a RangeIndex).
df['date'] = df.index
df = df.merge(nn_df, on="Store_ID")
df = df.set_index('date', drop=False)
In [28]:
df.head()
Out[28]:
Store_ID Store_Size Weekly_Sales date 1-NN
date
2010-02-07 1 151315 1643690.90 2010-02-07 6
2010-02-14 1 151315 1641957.44 2010-02-14 6
2010-02-21 1 151315 1611968.17 2010-02-21 6
2010-02-28 1 151315 1409727.59 2010-02-28 6
2010-03-07 1 151315 1554806.68 2010-03-07 6
In [29]:
NN_mae = []  # per-store test-period MAE for the NN-augmented model


def NN_prophet_forecast(store_id, forecast_period):
    """Prophet forecast for one store, using its 1-NN's sales as a regressor.

    The regressor is the neighbour's actual sales before `forecast_period`
    followed by the neighbour's base-model predictions from `forecast_period`
    onwards (the neighbour's future actuals would not be known at forecast time).

    Side effect: appends the store's test-period MAE to the global `NN_mae`.
    """
    sample_data = pd.DataFrame(df[df['Store_ID']==store_id]['Weekly_Sales'])
    sample_data['ds'] = sample_data.index  # duplicated assignment in the original removed
    sample_data['y'] = sample_data['Weekly_Sales']
    sample_data.drop('Weekly_Sales', axis=1, inplace=True)

    # The 1-NN id is constant across a store's rows, so read the first row
    # explicitly with .iloc (positional [] on a labelled Series is deprecated).
    nn_id = df[df['Store_ID']==store_id]['1-NN'].iloc[0]
    nn_sales = df[df['Store_ID']==nn_id]['Weekly_Sales']
    # Slice by the cutoff date rather than the hard-coded [:-24] of the
    # original, so any forecast_period works (24 was the test length for
    # the '2012-05-20' cutoff).
    sample_data['neighbour_1'] = np.concatenate(
        (nn_sales[nn_sales.index < forecast_period].values,
         base_predictions.get(nn_id)), axis=0)

    sample_data_train = sample_data[sample_data.index < forecast_period]
    sample_data_test = sample_data[sample_data.index >= forecast_period]
    sample_data.reset_index(inplace=True)

    m = Prophet(yearly_seasonality=True, daily_seasonality=False, weekly_seasonality=False)
    m.add_country_holidays(country_name='US')
    m.add_regressor('neighbour_1')
    m.fit(sample_data_train)
    forecast = m.predict(sample_data_test)

    cmp_df = make_comparison_dataframe(sample_data, forecast)
    NN_mae.append(mean_absolute_error(cmp_df['y'].values, cmp_df['yhat'].values))
    
In [30]:
# Produce NN-augmented forecasts for every store at the same cutoff.
cutoff = '2012-05-20'
for store_id in df.Store_ID.unique():
    NN_prophet_forecast(store_id, cutoff)
In [31]:
# Materialise the pairing as a list: a bare zip object is a one-shot iterator,
# so re-running the DataFrame construction below would otherwise see it empty.
data = list(zip(mae, NN_mae))
In [32]:
# Distribution of per-store MAE for the NN-augmented model.
sns.distplot(NN_mae, bins=len(NN_mae))
Out[32]:
<matplotlib.axes._subplots.AxesSubplot at 0x172282b8c18>
In [33]:
# Median per-store MAE for the NN-augmented model (higher than the base
# model's median of ~40,753 shown above).
np.median(NN_mae)
Out[33]:
44080.25357382671
In [34]:
# One row per store: base-model MAE vs NN-augmented MAE.
results_df = pd.DataFrame(data, columns=['Base_Model','NN_Augmented'])
In [35]:
# Count, per store, which model achieved the lower MAE.
from collections import Counter
Counter(results_df[['Base_Model','NN_Augmented']].idxmin(axis=1))
Out[35]:
Counter({'NN_Augmented': 19, 'Base_Model': 26})

For 26 stores the base model gives a better prediction, but for 19 stores the NN-augmented approach works best. It would be useful to decide whether to apply the augmented approach by looking at the distance to the nearest neighbour, since some neighbours may not actually be close to the query series.